This notebook requires the spark-avro package; launch spark-submit with the option:
--packages com.databricks:spark-avro_2.10:2.0.1
In [1]:
# Load the day's raw (uncompressed) Avro files into a DataFrame.
# Requires the com.databricks:spark-avro package on the classpath.
avro_reader = sqlContext.read.format("com.databricks.spark.avro")
df = avro_reader.load("/cms/wmarchive/test/avro/2016/01/01/")
In [3]:
%%bash
# Summarized (-s), human-readable (-h) HDFS size of the raw Avro input,
# used as the baseline for the compression comparison below.
hadoop fs -du -h -s /cms/wmarchive/test/avro/2016/01/01/
In [4]:
# Select Snappy as the Avro output codec, then persist a compressed copy.
sqlContext.setConf("spark.sql.avro.compression.codec", "snappy")
snappy_writer = df.write.format("com.databricks.spark.avro")
snappy_writer.save("wmarchive/test-avro-snappy-20160101")
In [5]:
%%bash
# Size of the Snappy-compressed copy, to compare against the raw input above.
hadoop fs -du -h -s wmarchive/test-avro-snappy-20160101
In [9]:
# Read the Snappy-compressed copy back for the scan-speed comparison.
snappy_reader = sqlContext.read.format("com.databricks.spark.avro")
dfSnappy = snappy_reader.load("wmarchive/test-avro-snappy-20160101")
In [12]:
%%time
# Baseline: wall-clock time of a full count over the uncompressed data.
df.count()
Out[12]:
In [13]:
%%time
# Same count over the Snappy-compressed copy, for timing comparison.
dfSnappy.count()
Out[13]:
In [14]:
# Avro "deflate" codec (zlib/DEFLATE) — note this is not gzip framing,
# though it is the closest Avro analogue of gzip compression.
sqlContext.setConf("spark.sql.avro.compression.codec", "deflate")
df.write.format("com.databricks.spark.avro").save("wmarchive/test-avro-deflate-20160101")
In [1]:
%%bash
# Size of the deflate-compressed copy, to compare against raw and Snappy.
hadoop fs -du -h -s wmarchive/test-avro-deflate-20160101
In [15]:
# Read the deflate-compressed copy back for the timing comparison.
deflate_reader = sqlContext.read.format("com.databricks.spark.avro")
dfDeflate = deflate_reader.load("wmarchive/test-avro-deflate-20160101")
In [16]:
%%time
# Count over the deflate-compressed copy, completing the codec comparison.
dfDeflate.count()
Out[16]:
In [ ]: